# Load the zipped Instacart training extract, pass it through clean_names()
# (presumably janitor's column-name normaliser -- confirm against the setup
# chunk), and keep only the unique rows.
instacart <-
  "./data/instacart_train_data.csv.zip" %>%
  read_csv() %>%
  clean_names() %>%
  distinct()
## Warning in strptime(x, format, tz = tz): unknown timezone 'default/America/
## New_York'
## Parsed with column specification:
## cols(
##   order_id = col_integer(),
##   product_id = col_integer(),
##   add_to_cart_order = col_integer(),
##   reordered = col_integer(),
##   user_id = col_integer(),
##   eval_set = col_character(),
##   order_number = col_integer(),
##   order_dow = col_integer(),
##   order_hour_of_day = col_integer(),
##   days_since_prior_order = col_integer(),
##   product_name = col_character(),
##   aisle_id = col_integer(),
##   department_id = col_integer(),
##   aisle = col_character(),
##   department = col_character()
## )

Column

Department Frequency

# Bar chart of the number of rows per department, with departments ordered by
# ascending count, then handed to ggplotly() for interactive rendering.
barplot_department_ggplot <- instacart %>%
  count(department) %>%
  # Reorder the factor levels by count so the bars appear sorted on the x axis.
  mutate(department = fct_reorder(department, n)) %>%
  ggplot(aes(x = department, y = n, fill = department)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # Name the unit_format() arguments explicitly: their positional order
  # differs between scales releases, so "k" and 1e-3 must not rely on position.
  scale_y_continuous(
    name = "N",
    labels = scales::unit_format(unit = "k", scale = 1e-3)
  )

ggplotly(barplot_department_ggplot)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Column

Distribution of Order Hour of Day by Department

# Box plots of order_hour_of_day for each department, one box per department
# and filled by department, then handed to ggplotly() for interactive
# rendering.
boxplot_order_hour <- ggplot(
  instacart,
  aes(x = department, y = order_hour_of_day, fill = department)
) +
  geom_boxplot() +
  # Rotate the department labels so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Order Hour of Day by Department")

ggplotly(boxplot_order_hour)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`